# woebin woebin_plot woebin_ply woebin_adj

# converting vector (breaks & special_values) to data frame
split_vec2df = function(vec) {
  value = . = bin_chr = V1 = NULL

  if (!is.null(vec)) data.table(
    value=vec, bin_chr=vec
  )[, rowid := .I
  ][, strsplit(as.character(value), "%,%", fixed=TRUE), by = .(rowid, bin_chr)
  ][, .(rowid, bin_chr, value = ifelse(V1=="missing", NA, as.character(V1)) )]
}
# add missing to spl_val if there is na in dtm$value and
# missing is not specified in breaks and spl_val
add_missing_spl_val = function(dtm, breaks, spl_val) {
  value = NULL

  if (dtm[,any(is.na(value))]) {
    no_missing = !any(grepl('missing', c(breaks, spl_val)))
    if (no_missing) {
      spl_val = c('missing',spl_val)
    }
  }
  return(spl_val)
}
# split dtm into bin_sv and dtm (without speical_values)
dtm_binning_sv = function(dtm, breaks, spl_val) {
  binning_sv = value = . = y = variable = count = neg = pos = bin = bin_chr = NULL
  # spl_val
  spl_val = add_missing_spl_val(dtm, breaks, spl_val)
  if (!is.null(spl_val)) {
    # special_values from vector to data frame
    sv_df = split_vec2df(spl_val)

    # dtm_sv & dtm
    dtm_sv = setDT(dtm)[value %in% sv_df$value]
    dtm = setDT(dtm)[!(value %in% sv_df$value)]

    # if (nrow(dtm_sv) == 0) return(list(binning_sv=NULL, dtm=dtm))
    # binning_sv
    binning_sv = merge(
      dtm_sv[, .(count=.N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = value][,value:=as.character(value)],
      sv_df[,value:=as.character(value)],
      all.x = TRUE, by='value'
    )[, value:=ifelse(is.na(value), "missing", as.character(value))
    ][, .(bin=unique(bin_chr), count=sum(count), neg=sum(neg), pos=sum(pos), variable=unique(variable)), by=rowid
    ][, .(variable, bin, count, neg, pos)]
  }

  return(list(binning_sv=binning_sv, dtm=dtm))
}


# check empty bins for unmeric variable
check_empty_bins = function(dtm, binning, bin_close_right) {
  . = bin = value = variable = y = NULL

  # check empty bins
  ## break points from bin
  breaks_list = lapply(
    list(left="\\1", right="\\2"),
    function(x) setdiff(sub(binpattern('leftright_brkp', bin_close_right), x, unique(binning$bin)), c("Inf","-Inf")) )
  ## if there are empty bins
  if (!setequal(breaks_list$left, breaks_list$right)) {
    bstbrks = unique(c(-Inf, unique(breaks_list$right), Inf))
    binning = dtm[
      , bin := cut(value, bstbrks, right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
    ][, .(count=.N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , keyby = .(bin)
    ]

    # warning( paste0("The break points are modified into \'", paste0(breaks_list$right, collapse = ", "), "\'. There are empty bins based on the provided break points." ) )
  }
  return(binning)
}
# check zero in neg pos, remove bins that have zeros in neg or pos column
check_zero_negpos = function(dtm, binning, count_distr_limit = NULL, bin_close_right ) {
  brkp = neg = pos = count = merge_tolead = count_lag = count_lead = brkp2 = . = variable = bin = posprob = value = NULL

  while (binning[!is.na(brkp)][neg==0 | pos==0,.N] > 0) {
    # brkp needs to be removed if neg==0 or pos==0
    rm_brkp = binning[!is.na(brkp)][
      ,count := neg+pos
      ][,`:=`(
        count_lag=shift(count,type="lag", fill=nrow(dtm)+1),
        count_lead=shift(count,type="lead", fill=nrow(dtm)+1)
      )][, merge_tolead := count_lag > count_lead
         ][neg == 0 | pos == 0][count == min(count)]

    # set brkp to lead's or lag's
    shift_type = ifelse(rm_brkp[1,merge_tolead], 'lead', 'lag')
    binning = binning[
      ,brkp2 := shift(brkp,type=shift_type)
      ][brkp == rm_brkp[1,brkp], brkp := brkp2]

    # groupby brkp
    binning = binning[
      ,.(variable=unique(variable), bin=paste0(bin, collapse = "%,%"), neg=sum(neg), pos=sum(pos)), by=brkp
      ][, posprob:=pos/(neg+pos)]
  }

  # format bin
  if (is.numeric(dtm[,value])) {
    binning = binning[
      grepl("%,%",bin), bin := sub(binpattern('multibin', bin_close_right), "\\1\\2", bin)
      ][bin == 'missing', brkp := NA
      ][bin != 'missing', brkp := get_brkp_bin(bin, bin_close_right)]
  }
  return(binning)
}
# check count distri, remove bins that count_distribution rate less than count_distr_limit
check_count_distri = function(dtm, binning, count_distr_limit, bin_close_right ) {
  count_distr = count = neg = pos = brkp = merge_tolead = count_lag = count_lead = brkp2 = . = variable = bin = value = NULL
  if (!('count' %in% names(binning))) binning[, count := neg + pos]

  binning[, count_distr := (count)/sum(count)]
  while (binning[!is.na(brkp)][count_distr<count_distr_limit,.N] > 0) {
    # brkp needs to be removed if neg==0 or pos==0
    rm_brkp = binning[!is.na(brkp)][
      ,count_distr := (count)/sum(count)
      ][,`:=`(
        count_lag=shift(count_distr,type="lag", fill=nrow(dtm)+1),
        count_lead=shift(count_distr,type="lead", fill=nrow(dtm)+1)
      )][, merge_tolead := count_lag > count_lead
         ][count_distr<count_distr_limit][count_distr == min(count_distr)]

    # set brkp to lead's or lag's
    shift_type = ifelse(rm_brkp[1,merge_tolead], 'lead', 'lag')
    binning = binning[
      ,brkp2 := shift(brkp,type=shift_type)
      ][brkp == rm_brkp[1,brkp], brkp := brkp2]

    # groupby brkp
    binning = binning[
      ,.(variable=unique(variable), bin=paste0(bin, collapse = "%,%"), count=sum(count), neg=sum(neg), pos=sum(pos)), by=brkp
      ][, count_distr := (count)/sum(count)]
  }

  # format bin
  if (is.numeric(dtm[,value])) {
    binning = binning[
      grepl("%,%",bin), bin := sub(binpattern('multibin', bin_close_right), "\\1\\2", bin)
      ][bin == 'missing', brkp := NA
        ][bin != 'missing', brkp := get_brkp_bin(bin, bin_close_right)]
  }
  return(binning)
}


# required in woebin2 # return binning if breaks provided
#' @import data.table
woebin2_breaks = function(dtm, breaks, spl_val, bin_close_right ) {
  # global variables or functions
  value = bin = . = y = variable = count = pos = neg = V1 = posprob = bksv_list = bin_chr = NULL

  # breaks from vector to data frame
  bk_df = split_vec2df(breaks)

  # dtm $ binning_sv
  dtm_binsv_list = dtm_binning_sv(dtm, breaks, spl_val)
  dtm = dtm_binsv_list$dtm
  binning_sv = dtm_binsv_list$binning_sv
  if (dtm[,.N] == 0 || is.null(dtm)) return(list(binning_sv=binning_sv, binning=NULL))


  # binning
  if (is.numeric(dtm[,value])) {
    bstbrks = brk_numx_init(unique(bk_df$value), dtm$value, bin_close_right)

    binning = dtm[
      , bin := cut(value, bstbrks, right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
    ][, .(count = .N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = .(bin)
    ][order(bin)]
    # check empty bins
    binning = check_empty_bins(dtm, binning, bin_close_right=bin_close_right)


    # merge binning with bk_df
    if (bk_df[is.na(value),.N] == 1) {
      binning = merge(
        binning[, value:=sub(binpattern('leftright_brkp', bin_close_right),"\\2",bin)],
        bk_df,
        all.x = TRUE, by="value"
      )[order(rowid,value)][, bin:=ifelse(is.na(bin), "missing", as.character(bin))
      ][, .(bin=paste0(bin,collapse="%,%"), count = sum(count), neg=sum(neg), pos=sum(pos), variable=unique(variable)), by=rowid
      ][order(rowid)]
    }

  } else if (is.factor(dtm[,value]) || is.character(dtm[,value])) {
    dtm = dtm[,value := as.character(value)]

    # the values not specified in breaks_list
    diff_dt_brk = setdiff(dtm[,unique(value)], bk_df[,value])
    if (length(diff_dt_brk) > 0) {
      warning(sprintf('The categorical values (`%s`) are not specified in `breaks_list` for the column `%s`.', paste0(diff_dt_brk, collapse = ', '), dtm[1,variable]) )
      stop()
    }

    # merge binning with bk_df
    binning = merge(
      dtm, bk_df[,bin:=bin_chr], all.x = TRUE
    )[order(rowid, bin)][, .(count = .N, neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = .(rowid, bin)]

  }


  # # remove rowid column in binning data frame
  binning = binning[,rowid:=1][,rowid:=NULL]
  # # bind binning_sv and binning
  # if (setDT(binning_sv)[,.N] > 0) binning = rbind(binning_sv, binning)

  return(list(binning_sv=binning_sv, binning=binning))
}

# required in woebin2 # return initial binning
woebin2_init_bin = function(dtm, init_count_distr, breaks, spl_val, bin_close_right ) {
  # global variables or functions
  . = pos = posprob = bin = brkp = count = neg = value = variable = y = NULL

  # dtm $ binning_sv
  dtm_binsv_list = dtm_binning_sv(dtm, breaks, spl_val)
  dtm = dtm_binsv_list$dtm
  binning_sv = dtm_binsv_list$binning_sv
  if (is.null(dtm) || dtm[,.N]==0) return(list(binning_sv=binning_sv, initial_binning=NULL))


  # binning
  if (is.numeric(dtm[,value])) {
    # numeric variable ------
    xvalue = dtm[, value]

    # breaks vector & outlier
    iqr = IQR(xvalue, na.rm = TRUE)
    if (iqr == 0) {
      prob_down = 0.01
      prob_up = 0.99
    } else {
      prob_down = 0.25
      prob_up = 0.75
    }
    xrng = quantile(xvalue, probs = c(prob_down, prob_up), na.rm = TRUE)
    xvalue_rm_outlier = xvalue[which(xvalue >= xrng[1]-3*iqr & xvalue <= xrng[2]+3*iqr)]

    # number of initial binning
    n = trunc(1/init_count_distr)
    len_uniq_x = length(setdiff(unique(xvalue_rm_outlier), c(NA,Inf,-Inf)))
    if (len_uniq_x < n) n = len_uniq_x

    # initial breaks
    if (len_uniq_x < 10) {
      brk = setdiff(unique(xvalue_rm_outlier), c(NA, Inf, -Inf))
    } else {
      brk = suppressWarnings(pretty(xvalue_rm_outlier, n))
    }
    brk = brk_numx_init(brk, xvalue, bin_close_right)
    if (anyNA(xvalue)) brk = c(brk, NA)

    # initial binning datatable
    init_bin = dtm[
      , bin := cut(value, brk, right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
    ][, .(neg = sum(y==0), pos = sum(y==1), variable=unique(variable)) , by = bin
    ][order(bin)]
    # check empty bins
    init_bin = check_empty_bins(dtm, init_bin, bin_close_right = bin_close_right)

    init_bin = init_bin[
      , `:=`(brkp = get_brkp_bin(bin, bin_close_right), posprob = pos/(neg+pos))
    ][, .(variable, bin, brkp, neg, pos, posprob)]

  } else if ( is.logical(dtm[,value]) || is.factor(dtm[,value]) || is.character(dtm[,value]) ) {
    # other variable ------

    # initial binning datatable
    init_bin = dtm[
      , .(variable = unique(variable), neg = sum(y==0), pos = sum(y==1)), by=value
      ][, posprob := pos/(neg+pos)]

    # order by bin if is.factor, or by posprob if is.character
    if (is.logical(dtm[,value]) || is.factor(dtm[,value])) {
      init_bin = init_bin[
        order(value)
      ][, brkp := ifelse(is.na(value), NA, .I)
      ][, .(variable, bin=value, brkp, neg, pos, posprob)]

    } else {

      init_bin = init_bin[
        order(posprob)
        # next 3 lines make NA located at the last rows
      ][, brkp := ifelse(is.na(value), NA, .I)
      ][order(brkp)
      ][, brkp := ifelse(is.na(value), NA, .I)
      ][, .(variable, bin=value, brkp, neg, pos, posprob)]
    }

  }

  # remove brkp that neg == 0 or pos == 0 ------
  init_bin = check_zero_negpos(dtm, init_bin, bin_close_right=bin_close_right)[, count := neg + pos]
  return(list(binning_sv=binning_sv, initial_binning=init_bin))
}


# required in woebin2_tree # add 1 best break for tree-like binning
woebin2_tree_add_1brkp = function(dtm, initial_binning, count_distr_limit, bestbreaks=NULL, bin_close_right ) {
  # global variables or functions
  brkp = patterns = . = neg = pos = variable = count_distr = value = min_count_distr = bstbin = min_count_distr = total_iv = bstbin = brkp = bin = NULL

  # total_iv for all best breaks
  total_iv_all_breaks = function(initial_binning, bestbreaks, dtm_rows) {
    # best breaks set
    breaks_set = setdiff( initial_binning[,brkp], c(bestbreaks, -Inf, Inf, NA) )

    init_bin_all_breaks = copy(initial_binning)
    # loop on breaks_set
    for (i in breaks_set) {
      # best break + i
      bestbreaks_i = sort(c(bestbreaks, i))

      # best break datatable
      init_bin_all_breaks = init_bin_all_breaks[
        , paste0("bstbin",i) := cut(brkp, c(-Inf, bestbreaks_i, Inf), right = bin_close_right, dig.lab = 10, ordered_result = FALSE) ]
    }

    # best break dt
    total_iv_all_brks = melt(
      init_bin_all_breaks, id = c("variable", "neg", "pos"), variable.name = "bstbin", measure = patterns("bstbin.+")
    )[, .(neg = sum(neg), pos = sum(pos), variable = unique(variable))
      , by=.(bstbin, value)
    ][, count_distr := (neg+pos)/dtm_rows, by=bstbin
    ][!is.na(value), min_count_distr := min(count_distr), by=bstbin
    ][, .(total_iv = iv_01(neg, pos), variable = unique(variable), min_count_distr = min(min_count_distr,na.rm=TRUE)), by=bstbin
    ][, bstbin := as.numeric(sub("bstbin(.+)", "\\1", bstbin))][]

    return(total_iv_all_brks)
  }
  # binning add 1best break
  binning_add_1bst = function(initial_binning, bestbreaks) {
    value = bstbin = . = neg = pos = variable = woe = bin_iv = total_iv = bstbrkp = posprob = NULL # no visible binding for global variable

    if ( is.numeric(dtm[,value]) ) {
      binning_1bst_brk = initial_binning[
        , bstbin := cut(brkp, c(-Inf, bestbreaks, Inf), right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
        ][, .(variable=unique(variable), bin=unique(bstbin), neg = sum(neg), pos = sum(pos)) , by = bstbin
          ]

    } else if (is.logical(dtm[,value]) || is.factor(dtm[,value]) || is.character(dtm[,value]) ) {

      bestbreaks = setdiff(bestbreaks, min(initial_binning[,brkp]))

      binning_1bst_brk = initial_binning[
        , bstbin := cut(brkp, c(-Inf, bestbreaks, Inf), right = bin_close_right, dig.lab = 10, ordered_result = FALSE)
        ][, .(variable=unique(variable), bin = paste0(bin, collapse = "%,%"), neg = sum(neg), pos = sum(pos)), by = bstbin ]
    }

    binning_1bst_brk = binning_1bst_brk[
      order(bstbin)
      ][, total_iv := iv_01(neg, pos)
        ][, bstbrkp := get_brkp_bin(bstbin, bin_close_right)
          ][, .(variable, bin, bstbin, bstbrkp, neg, pos, total_iv)]

    return(binning_1bst_brk)
  }


  # adding 1 best breakpoint
  dtm_rows = nrow(dtm)
  total_iv_all_brks = total_iv_all_breaks(initial_binning, bestbreaks, dtm_rows)

  # bestbreaks: total_iv == max(total_iv) & min(count_distr) >= count_distr_limit
  if (total_iv_all_brks[min_count_distr >= count_distr_limit, .N > 0]) {
    bstbrk_max_iv = total_iv_all_brks[min_count_distr >= count_distr_limit][total_iv==max(total_iv)][, bstbin]
    # add 1best break to bestbreaks
    bestbreaks = unique(c(bestbreaks, bstbrk_max_iv[1]))
  }

  bin_add_1bst = binning_add_1bst(initial_binning, bestbreaks)
  return(bin_add_1bst)
}
# required in woebin2 # return tree-like binning
woebin2_tree = function(
  dtm,
  init_count_distr  = 0.02,
  count_distr_limit = 0.05,
  stop_limit        = 0.1,
  bin_num_limit     = 8,
  breaks            = NULL,
  spl_val           = NULL,
  bin_close_right
) {
  # global variables or functions
  brkp = bstbrkp = total_iv = count = neg = pos =  NULL

  # initial binning
  bin_list = woebin2_init_bin(dtm, init_count_distr=init_count_distr, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
  initial_binning = bin_list$initial_binning
  binning_sv = bin_list$binning_sv

  if (nrow(initial_binning)<=1 || is.null(initial_binning)) {
    return(list(binning_sv=binning_sv, binning=initial_binning))
  }
  # initialize parameters
  ## length all breaks
  len_brks = initial_binning[!is.na(brkp), .N]
  ## param
  bestbreaks = NULL ## best breaks
  IVt1 = IVt2 = 1e-10
  IVchg = 1 ## IV gain ratio
  step_num = 1

  # best breaks from three to n+1 bins
  binning_tree = NULL
  while ( (IVchg >= stop_limit) & (step_num+1 <= min(bin_num_limit, len_brks)) ) {
    binning_tree = woebin2_tree_add_1brkp(dtm, copy(initial_binning), count_distr_limit, bestbreaks, bin_close_right=bin_close_right)
    # print(binning_tree)

    # update parameters
    ## best breaks
    bestbreaks = binning_tree[!(bstbrkp %in% c(-Inf, Inf)) & !is.na(bstbrkp), bstbrkp]
    ## information value
    IVt2 = binning_tree[1, total_iv]
    IVchg = IVt2/IVt1-1 ## ratio gain
    IVt1 = IVt2
    # print(IVchg)
    step_num = step_num + 1
  }

  if (is.null(binning_tree)) binning_tree = initial_binning

  return(list(binning_sv=binning_sv, binning=binning_tree[, count := neg + pos]))
  # return(binning_tree)
}
# examples
# system.time( binning_list <- woebin2_init_bin(dtm, init_count_distr=0.02, breaks =NULL, spl_val=NULL) )
# initial_binning=binning_list$initial_binning
# binning_sv = binning_list$binning_sv
# system.time( woebin2_tree_add_1brkp(dtm, initial_binning, count_distr_limit=0.05) )
# system.time( woebin2_tree(dtm, initial_binning, count_distr_limit=0.05) )


# required in woebin2 # return chimerge binning
#' @importFrom stats qchisq
woebin2_chimerge = function(
  dtm,
  init_count_distr  = 0.02,
  count_distr_limit = 0.05,
  stop_limit        = 0.1,
  bin_num_limit     = 8,
  breaks            = NULL,
  spl_val           = NULL,
  bin_close_right
) {
  .= a= a_colsum= a_lag= a_lag_rowsum= a_rowsum= a_sum= pos= bin= brkp= brkp2= chisq= count= count_distr= e= e_lag= chisq_lead= neg= negpos= merge_tolead =value= variable= NULL

  # [chimerge](http://blog.csdn.net/qunxingvip/article/details/50449376)
  # [ChiMerge:Discretization of numeric attributs](http://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)

  # chisq = function(a11, a12, a21, a22) {
  #   A = list(a1 = c(a11, a12), a2 = c(a21, a22))
  #   Adf = do.call(rbind, A)
  #
  #   Edf =
  #     matrix(rowSums(Adf), ncol = 1) %*%
  #     matrix(colSums(Adf), nrow = 1) /
  #     sum(Adf)
  #
  #   sum((Adf-Edf)^2/Edf)
  # }
  # initial binning
  bin_list = woebin2_init_bin(dtm, init_count_distr=init_count_distr, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
  initial_binning = bin_list$initial_binning
  binning_sv = bin_list$binning_sv

  if (nrow(initial_binning)<=1 || is.null(initial_binning)) {
    return(list(binning_sv=binning_sv, binning=initial_binning))
  }

  # function to create a chisq column in initial_binning
  add_chisq = function(initial_binning) {
  chisq_df = melt(initial_binning[!is.na(brkp)], id.vars = c("brkp", "variable", "bin"), measure.vars = c("neg", "pos"), variable.name = "negpos", value.name = "a"
  )[order(brkp)
  ][, a_lag := shift(a, type="lag"), by=.(negpos)
  ][, `:=`(
    a_rowsum = sum(a),
    a_lag_rowsum = sum(a_lag),
    a_colsum = a+a_lag,
    a_sum = sum(a+a_lag)), by='bin'
  ][, `:=`(
    e = (a_rowsum*a_colsum)/a_sum,
    e_lag = prod(a_lag_rowsum, a_colsum, na.rm = TRUE) / a_sum
  )][, .(chisq=sum((a-e)^2/e + (a_lag-e_lag)^2/e_lag)), by='bin']

  return(merge(initial_binning[,count:=neg+pos], chisq_df, all.x = TRUE, sort = FALSE))
  }

  # dtm_rows
  dtm_rows = nrow(dtm)
  # chisq limit
  chisq_limit = qchisq(1-stop_limit,1)
  # binning with chisq column
  binning_chisq = add_chisq(initial_binning)

  # param
  bin_chisq_min = binning_chisq[!is.na(chisq), min(chisq)]
  bin_count_distr_min = binning_chisq[!is.na(brkp), min((neg+pos)/dtm_rows)]
  bin_nrow = binning_chisq[,.N]
  # remove brkp if chisq < chisq_limit
  while (
    bin_chisq_min < chisq_limit ||
    bin_count_distr_min < count_distr_limit ||
    bin_nrow > bin_num_limit) {
    # brkp needs to be removed
    if (bin_count_distr_min < count_distr_limit) {
      rm_brkp = binning_chisq[,`:=`(
        count_distr = count/sum(count),
        chisq_lead = shift(chisq, type = "lead", fill = Inf)
      )][,merge_tolead := ifelse(is.na(chisq), TRUE, chisq > chisq_lead)
         ][!is.na(brkp)][order(count_distr)][1,]

    } else if (bin_chisq_min < chisq_limit) {
      rm_brkp = binning_chisq[, merge_tolead := FALSE][order(chisq, count)][1,]

    } else if (bin_nrow > bin_num_limit) {
      rm_brkp = binning_chisq[, merge_tolead := FALSE][order(chisq, count)][1,]

    } else break

    # groupby brkp
    shift_type = ifelse(rm_brkp[1,merge_tolead], 'lead', 'lag')
    binning_chisq = binning_chisq[
      ,brkp2 := shift(brkp,type=shift_type)
    ][brkp == rm_brkp[1,brkp], brkp := brkp2
    ][,.(variable=unique(variable), bin=paste0(bin, collapse = "%,%"), neg=sum(neg), pos=sum(pos)), by=brkp
    ]#[, posprob:=pos/(neg+pos)]

    # update
    ## add chisq to new binning data frame
    binning_chisq = add_chisq(binning_chisq)
    ## param
    bin_nrow = binning_chisq[,.N]
    if (bin_nrow == 1 || binning_chisq[!is.na(chisq), .N==0]) break
    bin_chisq_min = binning_chisq[!is.na(chisq), min(chisq)]
    bin_count_distr_min = binning_chisq[!is.na(brkp), min((neg+pos)/dtm_rows)]
  }

  # format bin # remove (.+\\)%,%\\[.+,)
  if (is.numeric(dtm[,value])) {
    binning_chisq = binning_chisq[grepl("%,%",bin), bin := sub(binpattern('multibin', bin_close_right), "\\1\\2", bin)]
  }

  return(list(binning_sv=binning_sv, binning=binning_chisq[, count := neg + pos]))
  # return(binning_chisq)
}


# required in woebin2 # return equal binning, supports numerical variables only
woebin2_equal = function(dtm, init_count_distr=0.02, count_distr_limit=0.05, stop_limit=0.1, bin_num_limit=8, breaks=NULL, spl_val=NULL, method='freq', bin_close_right ) {
  count = value = group = . = minv = maxv = bin = y = variable = pos = neg = posprob = NULL

  # dtm $ binning_sv
  dtm_binsv_list = dtm_binning_sv(dtm, breaks, spl_val)
  dtm = dtm_binsv_list$dtm
  binning_sv = dtm_binsv_list$binning_sv
  if (is.null(dtm) || dtm[,.N]==0) return(list(binning_sv=binning_sv, binning=NULL))

  # dt_sl = dtm[,.(label=y, datset=variable, score=value)]
  # dtm = dt_sl[,.(y=label, variable=datset, value=score)]


  # breaks
  unique_xvalue = dtm[, unique(value)]
  if (bin_num_limit >= length(unique_xvalue) ) {
    # in each value
    brkp = unique_xvalue

  } else {
    if (method == 'freq') {
      brkp = copy(dtm)[order(value)
                      ][, group := ceiling(.I/(.N/bin_num_limit))]
      if (bin_close_right) {
        brkp = brkp[, .(value = value[.N]), by = group][,value]
      } else {
        brkp = brkp[, .(value = value[ 1]), by = group][,value]
      }

    } else if (method == 'width') {
      brkp = seq(min(unique_xvalue, na.rm = TRUE), max(unique_xvalue, na.rm = TRUE), length.out = bin_num_limit+1)
      brkp = brkp[-c(1, length(brkp))]

    }
  }
  brkp = brk_numx_init(brkp, unique_xvalue, bin_close_right)

  binning_equal = dtm[, bin := cut(value, unique(brkp), right = bin_close_right, dig.lab = 10, ordered_result = F)
              ][, .(neg = sum(y==0), pos = sum(y==1), count = .N), keyby = .(variable, bin)
              ][, `:=`(brkp = get_brkp_bin(bin, bin_close_right), posprob = pos/(neg+pos))
              ][, .(variable, bin, brkp, count, neg, pos, posprob)]


  # create binning
  binning_equal = check_empty_bins(dtm, binning_equal, bin_close_right=bin_close_right)
  binning_equal = check_zero_negpos(dtm, binning_equal, bin_close_right=bin_close_right)
  binning_equal = check_count_distri(dtm, binning_equal, count_distr_limit, bin_close_right=bin_close_right)

  return(list(binning_sv=binning_sv, binning=binning_equal))
}

# required in woebin2 # # format binning output
binning_format = function(binning, bin_close_right ) {
  # global variables or functions
  . = pos = posprob = bin = bin_iv = neg = total_iv = variable = woe = is_sv = count = NULL

  # required columns in input binning: variable, bin, neg, pos
  if (!('count' %in% names(binning))) binning[, count := neg+pos]

  binning = binning[
    , posprob:=pos/(neg+pos)
  ][, woe := lapply(.SD, woe_01, pos), .SDcols = "neg"
  ][, bin_iv := lapply(.SD, miv_01, pos), .SDcols = "neg"
  ][, total_iv := sum(bin_iv)
  ][, bin := ifelse(is.na(bin) | bin=="NA", "missing", as.character(bin)) # replace NA by missing
  ][, .(variable, bin, count, count_distr=(neg+pos)/sum(neg+pos), neg, pos, posprob, woe, bin_iv, total_iv,  breaks = sub(binpattern('leftrightbrkp_missing', bin_close_right), "\\2\\3", bin), is_special_values=is_sv)]

  # move missing from last row to first
  if ( "missing" %in% binning$bin ) {
    binning = rbind(binning[bin=="missing"], binning[bin != "missing"])
  }

  return(binning[])
}

# woebin2
# This function provides woe binning for only two columns (one x and one y) data frame.
woebin2 = function(
  dtm,
  breaks            = NULL,
  spl_val           = NULL,
  init_count_distr  = 0.02,
  count_distr_limit = 0.05,
  stop_limit        = 0.1,
  bin_num_limit     = 8,
  bin_close_right   = FALSE,
  method            = "tree"
) {
  # global variables or functions
  . = pos = posprob = bin = bin_iv = neg = total_iv = variable = woe = is_sv = NULL


  # binning
  if (!anyNA(breaks) & !is.null(breaks)) {
    # 1.return binning if breaks provided
    bin_list = woebin2_breaks(dtm=dtm, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)

  } else {
    if (stop_limit == "N") {
      # binning of initial & specialvalues
      bin_list = woebin2_init_bin(dtm, init_count_distr=init_count_distr, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)

    } else {
      if (method == "tree") {
        # 2.tree-like optimal binning
        bin_list = woebin2_tree(dtm, init_count_distr, count_distr_limit, stop_limit, bin_num_limit, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)

      } else if (method == "chimerge") {
        # 2.chimerge optimal binning
        bin_list = woebin2_chimerge(dtm, init_count_distr, count_distr_limit, stop_limit, bin_num_limit, breaks=breaks, spl_val=spl_val, bin_close_right=bin_close_right)
      } else if (method %in% c('freq','width')) {
        # 3. in equal freq or width
        bin_list = woebin2_equal(dtm, init_count_distr, count_distr_limit, stop_limit, bin_num_limit, breaks=breaks, spl_val=spl_val, method = method, bin_close_right=bin_close_right)
      }
    }
  }

  # # binding binning_sv and binning
  if (any(sapply(bin_list, is.null))) {
    binning = rbindlist(bin_list)[, is_sv := names(bin_list)[!sapply(bin_list, is.null)]]
  } else {
    binning = rbindlist(bin_list, use.names = TRUE, fill = TRUE, idcol = 'is_sv')
  }
  binning = binning[, is_sv := is_sv == 'binning_sv']

  binning = binning_format(binning, bin_close_right=bin_close_right)
  return(binning)
}

# convert bins to breaks_list
bins_to_breaks = function(bins, dt, to_string=FALSE, save_name=NULL, bin_close_right ) {
  .= bin= bin2= is_special_values= variable= x_breaks= x_class = NULL

  # bins # if (is.list(bins)) rbindlist(bins)
  if (inherits(bins, 'list') && all(sapply(bins, is.data.frame))) bins = rbindlist(bins)
  bins = setDT(bins)

  # x variables
  xs_all = bins[,unique(variable)]

  # class of variables
  vars_class = data.table(
    variable = xs_all,
    x_class = dt[,sapply(.SD, class), .SDcols = xs_all]
  )

  # breaks
  bins_breakslist = bins[
    , bin2 := sub(binpattern('leftrightbrkp_missing', bin_close_right), "\\2\\3", bin)
    ][!(bin2 %in% c("-Inf","Inf","missing")) & !is_special_values
    ][vars_class, on="variable"
    ][, .(
      x_breaks = paste0(paste0("\"",bin2,"\""), collapse=", "),
      # x_breaks = paste(ifelse(x_class=="numeric", bin2, paste0("\"",bin2,"\"")), collapse=", "),
      x_class=unique(x_class)
    ), by=variable]

  if (to_string) {
    bins_breakslist = paste0(bins_breakslist[, paste0(variable, "=c(", x_breaks, ")")], collapse = ", \n ")
    bins_breakslist = paste0(c("breaks_list=list(", bins_breakslist, ")"), collapse = "\n ")
    bins_breakslist = paste0(
      sprintf("options(scorecard.bin_close_right = %s) \n", bin_close_right),
      bins_breakslist
    )

    if (!is.null(save_name)) {
      save_name = sprintf('%s_%s.R', save_name, format(Sys.time(),"%Y%m%d_%H%M%S"))
      writeLines(bins_breakslist, save_name, useBytes = TRUE)
      cat(sprintf('[INFO] The breaks_list is saved as %s\n', save_name))
      return()
    }
  }

  return(bins_breakslist)
}

# @param init_count_distr The minimum percentage of initial binning class number over total. Accepted range: 0.01-0.2; Defaults to 0.02, which means initial cut into 50 fine bins for continuous variables.

#' WOE Binning
#'
#' \code{woebin} generates optimal binning for numerical, factor and categorical variables using methods including tree-like segmentation or chi-square merge. \code{woebin} can also customizing breakpoints if the `breaks_list` was provided. The default `woe` is defined as ln(Pos_i/Neg_i). If you prefer ln(Neg_i/Pos_i), please set the argument `positive` as negative value, such as '0' or 'good'. If there is a zero frequency class when calculating woe, the zero will replaced by 0.99 to make the woe calculable.
#'
#' @param dt A data frame with both x (predictor/feature) and y (response/label) variables.
#' @param y Name of y variable.
#' @param x Name of x variables. Defaults to NULL. If x is NULL, then all columns except y and var_skip are counted as x variables.
#' @param var_skip Name of variables that will skip for binning. Defaults to NULL.
#' @param breaks_list List of break points, Defaults to NULL. If it is not NULL, variable binning will based on the provided breaks.
#' @param special_values the values specified in special_values will be in separate bins. Defaults to NULL.
#' @param stop_limit Stop binning segmentation when information value gain ratio less than the 'stop_limit' if using tree method; or stop binning merge when the chi-square of each neighbor bins are larger than the threshold under significance level of 'stop_limit' and freedom degree of 1 if using chimerge method. Accepted range: 0-0.5; Defaults to 0.1. If it is 'N', each x value is a bin.
# 'qchisq(1-stoplimit, 1)'
#' @param count_distr_limit The minimum count distribution percentage. Accepted range: 0.01-0.2; Defaults to 0.05.
#' @param bin_num_limit Integer. The maximum number of binning. Defaults to 8.
#' @param positive Value of positive class, defaults to "bad|1".
#' @param no_cores Number of CPU cores for parallel computation. Defaults to 2, if it sets to NULL then 90 percent of total cpu cores will be used.
#' @param print_step A non-negative integer. Defaults to 1. If print_step>0, print variable names by each print_step-th iteration. If print_step=0 or no_cores>1, no message is print.
#' @param method Four methods are provided, "tree" and "chimerge" for optimal binning that support both numerical and categorical variables, and 'width' and 'freq' for equal binning that support numerical variables only. Defaults to "tree".
#' @param save_breaks_list A string. The file name to save breaks_list. Defaults to None.
#' @param ignore_const_cols Logical. Ignore constant columns. Defaults to TRUE.
#' @param ignore_datetime_cols Logical. Ignore datetime columns. Defaults to TRUE.
#' @param check_cate_num Logical. Check whether the number of unique values in categorical columns larger than 50. It might make the binning process slow if there are too many unique categories. Defaults to TRUE.
#' @param replace_blank_inf Logical. Replace blank values with NA and infinite with -1. Defaults to TRUE.
#' @param ... Additional parameters.
#'
#' @return A list of data frames include binning information for each x variables.
#'
#' @seealso \code{\link{woebin_ply}}, \code{\link{woebin_plot}}, \code{\link{woebin_adj}}
#'
#' @examples
#' # load germancredit data
#' data(germancredit)
#'
#' # Example I
#' # binning of two variables in germancredit dataset
#' # using tree method
#' bins2_tree = woebin(germancredit, y="creditability",
#'    x=c("credit.amount","housing"), method="tree")
#' bins2_tree
#'
#' \dontrun{
#' # using chimerge method
#' bins2_chi = woebin(germancredit, y="creditability",
#'    x=c("credit.amount","housing"), method="chimerge")
#'
#' # binning in equal freq/width # only supports numerical variables
#' numeric_cols = c("duration.in.month", "credit.amount",
#'   "installment.rate.in.percentage.of.disposable.income", "present.residence.since",
#'   "age.in.years", "number.of.existing.credits.at.this.bank",
#'   "number.of.people.being.liable.to.provide.maintenance.for")
#' bins_freq  = woebin(germancredit, y="creditability", x=numeric_cols, method="freq")
#' bins_width = woebin(germancredit, y="creditability", x=numeric_cols, method="width")
#'
#' # y can be NULL if no label column in dataset
#' bins_freq_noy  = woebin(germancredit, y=NULL, x=numeric_cols)
#'
#' # Example II
#' # setting of stop_limit
#' # stop_limit = 0.1 (by default)
#' bins_x1 = woebin(germancredit, y = 'creditability', x = 'foreign.worker', stop_limit = 0.1)
#' # stop_limit = 'N', each x value is a bin
#' bins_x1_N = woebin(germancredit, y = 'creditability', x = 'foreign.worker', stop_limit = 'N')
#'
#' # Example III
#' # binning of the germancredit dataset
#' bins_germ = woebin(germancredit, y = "creditability")
#' # converting bins_germ into a data frame
#' # bins_germ_df = data.table::rbindlist(bins_germ)
#'
#' # Example IV
#' # customizing the breakpoints of binning
#' library(data.table)
#' dat = rbind(
#'   setDT(germancredit),
#'   data.table(creditability=sample(c("good","bad"),10,replace=TRUE)),
#'   fill=TRUE)
#'
#' breaks_list = list(
#'   age.in.years = c(26, 35, 37, "Inf%,%missing"),
#'   housing = c("own", "for free%,%rent")
#' )
#'
#' special_values = list(
#'   credit.amount = c(2600, 9960, "6850%,%missing"),
#'   purpose = c("education", "others%,%missing")
#' )
#'
#' bins_cus_brk = woebin(dat, y="creditability",
#'   x=c("age.in.years","credit.amount","housing","purpose"),
#'   breaks_list=breaks_list, special_values=special_values)
#'
#' # Example V
#' # save breaks_list as a R file
#' bins2 = woebin(germancredit, y="creditability",
#'    x=c("credit.amount","housing"), save_breaks_list='breaks_list')
#'
#' # Example VI
#' # setting bin closed on the right
#' options(scorecard.bin_close_right = TRUE)
#' binsRight = woebin(germancredit, y = 'creditability', x = 'age.in.years')
#' binsRight
#' # setting bin close on the left, the default setting
#' options(scorecard.bin_close_right = FALSE)
#' }
#'
#' @import data.table foreach
#' @importFrom stats IQR quantile setNames
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#' @importFrom parallel detectCores makeCluster stopCluster
#' @export
woebin = function(
  dt, y, x=NULL, var_skip=NULL, breaks_list=NULL, special_values=NULL,
  stop_limit=0.1, count_distr_limit=0.05, bin_num_limit=8,
  positive="bad|1", no_cores=2, print_step=0L, method="tree", save_breaks_list=NULL,
  ignore_const_cols=TRUE, ignore_datetime_cols=TRUE, check_cate_num=TRUE, replace_blank_inf=TRUE, ...) {
  # global variable
  i = NULL

  # start time
  start_time = proc.time()
  # arguments ------
  kwargs = list(...)
  # print_info
  print_info = kwargs[['print_info']]
  if (is.null(print_info)) print_info = TRUE
  if (print_info) cat('[INFO] creating woe binning ... \n')

  # method
  method = try(match.arg(method, c("tree", "chimerge", 'freq', 'width')), silent = TRUE)
  if (inherits(method, 'try-error')) {
    warning("Incorrect inputs; method should be tree or chimerge. Parameter was set to default (tree).")
    method = "tree"
  }
  if (is.null(y) & !(method %in% c('freq', 'width'))) method = 'freq'

  # init_count_distr
  min_perc_fine_bin = kwargs[['min_perc_fine_bin']]
  init_count_distr = kwargs[['init_count_distr']]
  if (is.null(init_count_distr)) {
    init_count_distr <- ifelse(!is.null(min_perc_fine_bin), min_perc_fine_bin, 0.02)
  }
  # count_distr_limit
  min_perc_coarse_bin = kwargs[['min_perc_coarse_bin']]
  if (!is.null(min_perc_coarse_bin)) count_distr_limit = min_perc_coarse_bin
  # bin_num_limit
  max_num_bin = kwargs[['max_num_bin']]
  if (!is.null(max_num_bin)) bin_num_limit = max_num_bin
  # bin_close_right
  bin_close_right = getarg('bin_close_right')
  if (print_info & !is.null(breaks_list)) cat(sprintf("[INFO] The option bin_close_right was set to %s.\n", bin_close_right))

  # set dt as data.table
  dt = setDT(copy(dt))  #copy(setDT(dt))
  if (!is.null(x)) dt = dt[, c(y,x), with=FALSE]
  # check y
  if (!is.null(y)) dt = check_y(dt, y, positive)
  # remove constant columns
  if (ignore_const_cols) dt = check_const_cols(dt)
  # remove date/time columns
  if (ignore_datetime_cols) dt = check_datetime_cols(dt)
  # check categorical columns' unique values
  if (check_cate_num) check_cateCols_uniqueValues(dt, var_skip)
  # replace black with na
  if (replace_blank_inf) dt = rep_blank_na(dt)
  # x variable names
  xs = x_variable(dt, y, x, var_skip, method)
  xs_len = length(xs)
  # print_step
  print_step = check_print_step(print_step)
  # breaks_list
  breaks_list = check_breaks_list(breaks_list, xs)
  # special_values
  special_values = check_special_values(special_values, xs)
  # stop_limit
  stop_limit = check_stop_limit(stop_limit, xs)


  # init_count_distr range
  if ( init_count_distr<0.01 || init_count_distr>0.2 || !is.numeric(init_count_distr) ) {
    warning("Incorrect parameter specification; accepted init_count_distr parameter range is 0.01-0.2. Parameter was set to default (0.02).")
    init_count_distr = 0.02
  }
  # count_distr_limit
  if ( count_distr_limit<0.01 || count_distr_limit>0.2 || !is.numeric(count_distr_limit) ) {
    warning("Incorrect parameter specification; accepted count_distr_limit parameter range is 0.01-0.2. Parameter was set to default (0.05).")
    count_distr_limit = 0.05
  }
  # bin_num_limit
  if (!is.numeric(bin_num_limit)) {
    warning("Incorrect inputs; bin_num_limit should be numeric variable. Parameter was set to default (8).")
    bin_num_limit = 8
  }


  # binning ------
  # loop on xs
  # https://www.r-bloggers.com/how-to-go-parallel-in-r-basics-tips/
  # https://privefl.github.io/blog/a-guide-to-parallelism-in-r/
  no_cores = check_no_cores(no_cores, xs_len)

  bins = list()
  if (!is.null(y)) {
    ycol = dt[[y]]
  } else ycol = NA

  if (no_cores == 1) {
    for (i in 1:xs_len) {
      x_i = xs[i]
      dtm = data.table(y=ycol, variable=x_i, value=dt[[x_i]])
      # print xs
      if (print_step>0 & i %% print_step == 0) cat(paste0(format(c(i,xs_len)),collapse = "/"), x_i,"\n")

      # woebining on one variable
      bins[[x_i]] <-
        try(do.call('woebin2', args = list(
          dtm              = dtm,
          breaks           = breaks_list[[x_i]],
          spl_val          = special_values[[x_i]],
          init_count_distr = init_count_distr,
          count_distr_limit= count_distr_limit,
          stop_limit       = stop_limit[[x_i]],
          bin_num_limit    = bin_num_limit,
          bin_close_right  = bin_close_right,
          method           = method
        )), silent = TRUE)
    }

  } else {
    # type_psock_fork = ifelse(Sys.info()["sysname"] == 'Windows', 'PSOCK', 'FORK')
    cl = makeCluster(no_cores)#, type = type_psock_fork)
    registerDoParallel(cl)
    # registerDoParallel(no_cores)
    # run
    bins <-
      foreach(
        i = seq_len(xs_len),
        .combine = 'rbind',
        # .final = function(bs) {
        #   if (xs_len==1) bs = list(bs)
        #   setNames(bs, xs)
        # },
        .inorder = FALSE,
        .multicombine = TRUE,
        # .maxcombine = xs_len+1,
        .errorhandling = "remove"#,
        # .packages	= 'data.table',
        # .verbose = TRUE
        # .export = c('dt', 'xs', 'ycol', 'breaks_list', 'special_values', 'init_count_distr', 'count_distr_limit', 'stop_limit', 'bin_num_limit', 'bin_close_right', 'method')
      ) %dopar% {
        x_i = xs[i]
        dtm = data.table(y=ycol, variable=x_i, value=dt[[x_i]])
        # woebining on one variable
        try(do.call('woebin2', args = list(
          dtm              = dtm,
          breaks           = breaks_list[[x_i]],
          spl_val          = special_values[[x_i]],
          init_count_distr = init_count_distr,
          count_distr_limit= count_distr_limit,
          stop_limit       = stop_limit[[x_i]],
          bin_num_limit    = bin_num_limit,
          bin_close_right  = bin_close_right,
          method           = method
        )), silent = TRUE)
      }
    # finish
    stopCluster(cl)
    # stopImplicitCluster()
    bins = split(bins, by = 'variable')

  }

  # check errors in binning
  bins = bins[! sapply(bins, function(x) inherits(x, 'try-error'))]
  error_variables = setdiff(xs, names(bins))
  if (length(error_variables) > 0) {
    warning(sprintf('The following columns are removed from binning results due to errors:\n%s', paste0(error_variables, collapse=', ')))
    bins = bins[setdiff(names(bins), error_variables)]
  }

  # running time
  rs = proc.time() - start_time
  # hms
  if (rs[3] > 10 & print_info) cat(sprintf("[INFO] Binning on %s rows and %s columns in %s",nrow(dt),ncol(dt),sec_to_hms(rs[3])),"\n")
  # save breaks_list
  if (!is.null(save_breaks_list)) bins_to_breaks(bins, dt, to_string=TRUE, save_name=save_breaks_list, bin_close_right = bin_close_right)
  return(bins)
}




#' @import data.table
woepoints_ply1 = function(dtx, binx, x_i, woe_points, bin_close_right ) {
  # woe_points: "woe" "points"
  . = V1 = bin = woe = NULL

  # binx
  binx = binx[
    , bin:=as.character(bin)
  ][,.(unlist(strsplit(bin, "%,%", fixed=TRUE)),
       eval(parse(text = woe_points)) ), by=bin]

  # dtx
  ## cut numeric variable
  if ( is.numeric(dtx[[x_i]]) ) {
    binx_sv = binx[!grepl(binpattern('isbin', bin_close_right),V1)]
    binx_other = binx[grepl(binpattern('isbin', bin_close_right),V1)]

    dtx[[x_i]] = ifelse(
      dtx[[x_i]] %in% binx_sv$V1,
      dtx[[x_i]],
      as.character(cut(dtx[[x_i]], unique(c(-Inf, binx_other[bin != "missing", get_brkp_bin(V1, bin_close_right)], Inf)), right = bin_close_right, dig.lab = 10, ordered_result = FALSE))
    )

  }
  ## to charcarter, na to missing
  dtx[[x_i]] = as.character(dtx[[x_i]])
  dtx[[x_i]] = ifelse(is.na(dtx[[x_i]]), "missing", dtx[[x_i]])
  ## add rowid column
  dtx = setDT(dtx)[, rowid := .I]

  # rename binx
  setnames(binx, c("bin", x_i, paste(x_i, woe_points, sep="_")))
  # merge
  dtx_suffix = merge(dtx, binx, by=x_i, all.x = TRUE)
  dtx_suffix = setDT(dtx_suffix)[order(rowid)][, (c("rowid", "bin", x_i)) := NULL]

  return(dtx_suffix)
}

#' WOE/BIN Transformation
#'
#' \code{woebin_ply} converts original values of input data into woe or bin based on the binning information generated from \code{woebin}.
#'
#' @param dt A data frame.
#' @param bins Binning information generated from \code{woebin}.
#' @param to Converting original values to woe or bin. Defaults to woe.
#' @param no_cores Number of CPU cores for parallel computation. Defaults to 2, if it sets to NULL then 90 percent of total cpu cores will be used.
#' @param print_step A non-negative integer. Defaults to 1. If print_step>0, print variable names by each print_step-th iteration. If print_step=0 or no_cores>1, no message is print.
#' @param replace_blank_inf Logical. Replace blank values with NA and infinite with -1. Defaults to TRUE. This argument should be the same with \code{woebin}'s.
#' @param ... Additional parameters.
#'
#' @return A data frame with columns for variables converted into woe values.
#'
#' @seealso  \code{\link{woebin}}, \code{\link{woebin_plot}}, \code{\link{woebin_adj}}
#'
#' @examples
#' # load germancredit data
#' data(germancredit)
#'
#' # Example I
#' dt = germancredit[, c("creditability", "credit.amount", "purpose")]
#'
#' # binning for dt
#' bins = woebin(dt, y = "creditability")
#'
#' # converting to woe
#' dt_woe = woebin_ply(dt, bins=bins)
#' str(dt_woe)
#'
#' # converting to bin
#' dt_bin = woebin_ply(dt, bins=bins, to = 'bin')
#' str(dt_bin)
#'
#' \donttest{
#' # Example II
#' # binning for germancredit dataset
#' bins_germancredit = woebin(germancredit, y="creditability")
#'
#' # converting the values in germancredit to woe
#' # bins is a list which generated from woebin()
#' germancredit_woe = woebin_ply(germancredit, bins_germancredit)
#'
#' # bins is a data frame
#' bins_df = data.table::rbindlist(bins_germancredit)
#' germancredit_woe = woebin_ply(germancredit, bins_df)
#'
#' }
#'
#' @import data.table
#' @export
#'
woebin_ply = function(dt, bins, to='woe', no_cores=2, print_step=0L, replace_blank_inf=TRUE, ...) {

  # global variables or functions
  . = V1 = bin = variable = woe = i = databc_colomun_placeholder = NULL

  # start time
  start_time = proc.time()

  # arguments
  kwargs = list(...)
  # print info
  print_info = kwargs[['print_info']]
  if (is.null(print_info)) print_info = TRUE
  if (print_info) cat('[INFO] converting into woe values ... \n')
  # to woe/bin
  if (!is.null(kwargs[['value']])) to = kwargs[['value']]
  if (is.null(to) || !(to %in% c('woe', 'bin'))) to = 'woe'

  # set dt as data.table
  dt = setDT(copy(dt))
  # # remove date/time col
  # dt = rmcol_datetime_unique1(dt)
  # replace "" by NA
  if (replace_blank_inf) dt = rep_blank_na(dt)
  # print_step
  print_step = check_print_step(print_step)

  # bins # if (is.list(bins)) rbindlist(bins)
  if (inherits(bins, 'list') && all(sapply(bins, is.data.frame))) bins = rbindlist(bins)
  bins = setDT(bins)
  # bin_close_right
  bin_close_right = check_bcr(bins)

  # x variables
  xs_bin = bins[,unique(variable)]
  xs_dt = names(dt)
  xs = intersect(xs_bin, xs_dt)
  # loop on x variables
  xs_len = length(xs)

  # initial data set
  n = 0
  while (paste0('dat_col_placeholder',n) %in% xs) n = n+1
  dt_init = copy(dt)[, (paste0('dat_col_placeholder',n)) := 1][,(xs) := NULL]
  # the databc_colomun_placeholder will be remove in the result, in case dt_init is an empty dataframe

  # loop on xs # https://www.r-bloggers.com/how-to-go-parallel-in-r-basics-tips/
  no_cores = check_no_cores(no_cores)

  if (no_cores == 1) {
    dat = dt_init
    for (i in 1:xs_len) {
      x_i = xs[i]
      # print x
      if (print_step > 0 & i %% print_step == 0) cat(paste0(format(c(i,xs_len)),collapse = "/"), x_i,"\n")

      binx = bins[variable==x_i]
      dtx = dt[, x_i, with=FALSE]

      dat = cbind(dat, woepoints_ply1(dtx, binx, x_i, woe_points=to, bin_close_right = bin_close_right))
    }
  } else {
    # type_psock_fork = ifelse(Sys.info()["sysname"] == 'Windows', 'PSOCK', 'FORK')
    cl = makeCluster(no_cores)#, type = type_psock_fork)
    registerDoParallel(cl)
    # registerDoParallel(no_cores)
    # run
    dat <-
      foreach(
        i = 1:xs_len,
        .combine=cbind,
        .init = dt_init,
        .inorder = FALSE,
        .errorhandling = "pass"#,
        # .export = c('dt', 'bins', 'xs')
      ) %dopar% {
        x_i = xs[i]

        binx = bins[variable==x_i]
        dtx = dt[, x_i, with=FALSE]

        woepoints_ply1(dtx, binx, x_i, woe_points=to, bin_close_right = bin_close_right)
      }
    # finish
    stopCluster(cl)
    # stopImplicitCluster()
  }

  # running time
  rs = proc.time() - start_time
  # hms
  if (rs[3] > 10 & print_info) cat(sprintf("[INFO] Woe transformating on %s rows and %s columns in %s",nrow(dt),xs_len,sec_to_hms(rs[3])),"\n")

  return(dat[, (paste0('dat_col_placeholder',n)) := NULL])
}


# required in woebin_plot
#' @import data.table ggplot2
plot_bin = function(bin, title, show_iv, show_lineval = TRUE, show_barval = TRUE, line_value = 'posprob', line_color = 'blue', bar_color = NULL, ...) {
  # global variables or functions
  . = pos = posprob = posprob2 = count = count_distr = count_distr2 = count_num = neg = negpos = total_iv = value = variable = woe = lin_val = lin_val2 = lin_val_label = NULL

  if ( all(is.na(bin$neg)) || all(is.na(bin$pos)) ) return(NULL)
  # data
  ## y_right_max
  if (line_value == 'posprob') {
    y_right_max = ceiling(max(bin[[line_value]], na.rm=T)*10)
    if (y_right_max %% 2 ==1) y_right_max=y_right_max+1
    if (y_right_max - max(bin[[line_value]], na.rm=T)*10 <= 0.3) y_right_max = y_right_max+2
    y_right_max = y_right_max/10
    if (y_right_max>1 || y_right_max<=0 || is.na(y_right_max) || is.null(y_right_max)) y_right_max=1
  } else {
    y_right_max = max(bin[[line_value]])
  }
  ## y_right_min
  y_right_min = ifelse(line_value == 'posprob', 0, min(bin[[line_value]]))
  ## y_right_name
  y_right_name = ifelse(line_value == 'posprob', 'Positive probability', 'woe')

  ## y_left_max
  y_left_max = ceiling(max(bin$count_distr, na.rm=T)*10)/10
  if (y_left_max>1 || y_left_max<=0 || is.na(y_left_max) || is.null(y_left_max)) y_left_max = 1



  ## data set
  bin = setDT(bin)
  dat = bin[,.(
    variable, bin, count_num=count, count_distr2=count_distr, count_distr, neg=neg/sum(count), pos=pos/sum(count), posprob, woe
  )][, `:=`(
    bin = ifelse(is.na(bin), "NA", bin),
    lin_val2 = (get(line_value)-y_right_min)*(y_left_max/(y_right_max-y_right_min)),
    lin_val = round(get(line_value),4),
    rowid = .I
  )][, `:=`(
    bin = factor(bin, levels = bin),
    lin_val_label = paste0(round(lin_val*100, 1), "%")
  )][]
  if (line_value == 'woe') dat = dat[, lin_val_label := round(lin_val, 2)]



  dat_melt = melt(dat, id.vars = c("variable", "bin","rowid"), measure.vars =c("neg", "pos"), variable.name = "negpos")[
    ,negpos:=factor(negpos, levels=c( "pos", "neg")) ]

  # title
  if (!is.null(title)) title = paste0(title, "-")
  if (show_iv) {
    title_string = paste0(title, dat[1, variable],"  (iv:",bin[1,round(total_iv,4)],")")
  } else {
    title_string = paste0(title, dat[1, variable])
  }

  # plot
  p_bin = ggplot() +
    # geom_text(aes(label="@shichen.name/getpedr", x=dat[, x[.N], by=symbol][,V1[1]], y=Inf), vjust = -0.5, hjust = 1, color = "#F0F0F0") +
    # coord_cartesian(clip = 'off') +
    geom_bar(data=dat_melt, aes(x=bin, y=value, fill=negpos), stat="identity") +
    geom_line(data=dat, aes(x = rowid, y = lin_val2), colour = line_color) +
    geom_point(data=dat, aes(x = rowid, y = lin_val2), colour = line_color, shape=21, fill="white") +
    scale_y_continuous(limits = c(0, y_left_max), sec.axis = sec_axis(~./(y_left_max/(y_right_max-y_right_min))+y_right_min, name = y_right_name)) +
    labs(title = title_string, x=NULL, y = "Count distribution", fill=NULL) +
    theme_bw() +
    theme(
      legend.position="bottom", legend.direction="horizontal",
      axis.title.y.right = element_text(colour = line_color),
      axis.text.y.right  = element_text(colour = line_color, angle = 90, hjust = 0.5),
      axis.text.y = element_text(angle = 90, hjust = 0.5) )

  if (show_barval) p_bin = p_bin + geom_text(data=dat, aes(x = bin, y = count_distr2, label = paste0(round(count_distr2*100, 1), "%, ", count_num) ), vjust = 0.5)
  if (show_lineval) p_bin = p_bin + geom_text(data=dat, aes(x = rowid, y = lin_val2, label = lin_val_label), colour = line_color, vjust = -0.5)

  if (!is.null(bar_color)) p_bin = p_bin + scale_fill_manual(values= bar_color)

  return(p_bin)
}
#' WOE Binning Visualization
#'
#' \code{woebin_plot} create plots of count distribution and positive probability for each bin. The binning informations are generates by  \code{woebin}.
#'
#' @name woebin_plot
#' @param bins A list of data frames. Binning information generated by \code{woebin}.
#' @param x Name of x variables. Defaults to NULL. If x is NULL, then all columns except y are counted as x variables.
#' @param title String added to the plot title. Defaults to NULL.
#' @param show_iv Logical. Defaults to TRUE, which means show information value in the plot title.
#' @param line_value The value displayed as line. Accepted values are 'posprob' and 'woe'. Defaults to positive probability.
#' @param ... Additional parameters
#'
#' @return A list of binning graphics.
#'
#' @seealso  \code{\link{woebin}}, \code{\link{woebin_ply}}, \code{\link{woebin_adj}}
#'
#' @examples
#' # Load German credit data
#' data(germancredit)
#'
#' # Example I
#' bins1 = woebin(germancredit, y="creditability", x="credit.amount")
#'
#' p1 = woebin_plot(bins1)
#' print(p1)
#'
#' # modify line value
#' p1_w = woebin_plot(bins1, line_value = 'woe')
#' print(p1_w)
#'
#' # modify colors
#' p1_c = woebin_plot(bins1, line_color='#FC8D59', bar_color=c('#FFFFBF', '#99D594'))
#' print(p1_c)
#'
#' # show iv, line value, bar value
#' p1_iv = woebin_plot(bins1, show_iv = FALSE)
#' print(p1_iv)
#' p1_lineval = woebin_plot(bins1, show_lineval = FALSE)
#' print(p1_lineval)
#' p1_barval  = woebin_plot(bins1, show_barval = FALSE)
#' print(p1_barval)
#'
#' \donttest{
#' # Example II
#' bins = woebin(germancredit, y="creditability")
#' plotlist = woebin_plot(bins)
#' print(plotlist$credit.amount)
#'
#' # # save binning plot
#' # for (i in 1:length(plotlist)) {
#' #   ggplot2::ggsave(
#' #      paste0(names(plotlist[i]), ".png"), plotlist[[i]],
#' #      width = 15, height = 9, units="cm" )
#' #   }
#' }
#'
#' @import data.table ggplot2
#' @export
#'
woebin_plot = function(bins, x=NULL, title=NULL, show_iv = TRUE, line_value = 'posprob', ...) {
  # global variables or functions
  variable = NULL
  xs = x

  # kwargs = list(...)
  # line value
  if (!(line_value %in% c('posprob', 'woe'))) line_value = 'posprob'
  # line bar colors
  # line_color = kwargs[['line_color']]
  # if (is.null(line_color)) line_color = 'blue'
  # bar_color = kwargs[['bar_color']]

  # bins # if (is.list(bins)) rbindlist(bins)
  if (inherits(bins, 'list') && all(sapply(bins, is.data.frame))) bins = rbindlist(bins)
  bins = setDT(bins)

  # x variable names
  if (is.null(xs)) xs = bins[,unique(variable)]


  # plot export
  plotlist = list()
  for (i in xs) plotlist[[i]] = plot_bin(bins[variable==i], title, show_iv, line_value = line_value, ...)


  return(plotlist)
}



# print basic information in woebin_adj
woebin_adj_print_basic_info = function(dt, y, xs_adj, i, bins, bins_breakslist, ...) {
  x_i = xs_adj[i]
  xs_len = length(xs_adj)
  variable = x_breaks = NULL

  cat("--------", paste0(i, "/", xs_len), x_i, "--------\n")
  ## class
  cat(paste0("> class(",x_i,"): "),"\n",class(dt[[x_i]]),"\n","\n")
  ## summary
  cat(paste0("> summary(",x_i,"): "),"\n")
  print(summary(dt[[x_i]]))
  cat("\n")
  ## table
  if (length(table(dt[[x_i]])) < 10 || !is.numeric(dt[[x_i]])) {
    cat(paste0("> table(",x_i,"): "))
    print(table(dt[[x_i]]))
    cat("\n")
  } else {
    if ( is.numeric(dt[[x_i]])) {
      ht = hist(dt[[x_i]], plot = FALSE)
      plot(ht, main = x_i, xlab = NULL)
    }
  }
  ## current breaks
  breaks_bin = bins_breakslist[variable == x_i, x_breaks]

  cat("> Current breaks: \n", breaks_bin,"\n \n")
  ## woebin plotting
  brklst = list()
  brklst[x_i] = list(brk_txt2vector(breaks_bin))

  plist = woebin_plot(woebin(dt, y = y, x=x_i, breaks_list = brklst, print_info = FALSE), ...)
  print(plist[[1]])

}
# plot adjusted binning in woebin_adj
woebin_adj_break_plot = function(dt, y, x_i, breaks, stop_limit, sv_i, method, bin_close_right, ...) {
  bin_adj = brk_lst = spc_val = NULL

  brk_lst[x_i] = list(brk_txt2vector(breaks))
  spc_val[x_i] = list(sv_i)
  # if (stop_limit != 'N') stop_limit = 0.1

  # text_woebin = paste0("bin_adj=woebin(dt[,c(\"",x_i,"\",\"",y,"\"),with=F], \"",y,"\", breaks_list=list(",x_i,"=c(",breaks,")), special_values =list(",x_i,"=c(", sv_i, ")), ", ifelse(stop_limit=="N","stop_limit = \"N\", ",NULL), "print_step=0L, print_info=FALSE, method=\"",method,"\")")

  # eval(parse(text = text_woebin))
  bin_adj = woebin(dt = dt, y = y, x = x_i, breaks_list = brk_lst, special_values = spc_val, stop_limit = stop_limit, print_step = 0L, print_info=FALSE, method = method, ...)


  ## print adjust breaks
  breaks_bin = setdiff(sub(binpattern('leftrightbrkp_missing', bin_close_right), "\\2\\3", bin_adj[[1]]$bin), c("-Inf","Inf","missing"))
  breaks_bin = paste0(paste0("\"",breaks_bin,"\""), collapse=", ")
    # ifelse(
    # is.numeric(dt[[x_i]]),
    # paste0(breaks_bin, collapse=", "),
    # paste0(paste0("\"",breaks_bin,"\""), collapse=", "))
  cat("> Current breaks: ","\n",breaks_bin,"\n","\n")

  # print bin_adj
  print(woebin_plot(bin_adj, ...)[[1]])

  # # breaks
  # if (breaks == "" || is.null(breaks))
    breaks = breaks_bin

  return(breaks)
}
#' WOE Binning Adjustment
#'
#' \code{woebin_adj} interactively adjust the binning breaks.
#'
#' @param dt A data frame.
#' @param y Name of y variable.
#' @param bins A list of data frames. Binning information generated from \code{woebin}.
#' @param adj_all_var Logical, whether to show variables have monotonic woe trends. Defaults to TRUE
#' @param special_values The values specified in special_values will in separate bins. Defaults to NULL.
#' @param method Optimal binning method, it should be "tree" or "chimerge". Defaults to "tree".
#' @param save_breaks_list A string. The file name to save breaks_list. Defaults to None.
#' @param count_distr_limit The minimum count distribution percentage. Accepted range: 0.01-0.2; Defaults to 0.05. This argument should be the same with woebin's.
#' @param to Adjusting bins into breaks_list or bins_list. Defaults to breaks_list.
#' @param ... Additional parameters.
#'
#' @return A list of modified break points of each x variables.
#'
#' @seealso  \code{\link{woebin}}, \code{\link{woebin_ply}}, \code{\link{woebin_plot}}
#'
#' @examples
#' \dontrun{
#' # Load German credit data
#' data(germancredit)
#'
#' # Example I
#' dt = germancredit[, c("creditability", "age.in.years", "credit.amount")]
#' bins = woebin(dt, y="creditability")
#' breaks_adj = woebin_adj(dt, y="creditability", bins)
#' bins_final = woebin(dt, y="creditability",
#'                     breaks_list=breaks_adj)
#'
#' # Example II
#' binsII = woebin(germancredit, y="creditability")
#' breaks_adjII = woebin_adj(germancredit, "creditability", binsII)
#' bins_finalII = woebin(germancredit, y="creditability",
#'                     breaks_list=breaks_adjII)
#' }
#'
#' @import data.table
#' @importFrom utils menu
#' @importFrom graphics hist plot
#' @export
woebin_adj = function(dt, y, bins, adj_all_var=TRUE, special_values=NULL, method="tree", save_breaks_list=NULL, count_distr_limit=0.05, to='breaks_list', ...) {
  # global variables or functions
  . = V1 = posprob = posprob2 = bin2 = bin = bin_adj = count_distr = variable = x_breaks = x_class = NULL

  dt = setDT(copy(dt))
  # bins # if (is.list(bins)) rbindlist(bins)
  if (inherits(bins, 'list') && all(sapply(bins, is.data.frame))) bins = rbindlist(bins)
  bins = setDT(bins)

  # x variables
  xs_all = bins[,unique(variable)]
  if (adj_all_var == FALSE) {
    xs_adj = bins[
      !(bin == "missing" & count_distr >= count_distr_limit)
    ][, posprob2 := posprob >= shift(posprob, type = "lag"), by=variable
    ][!is.na(posprob2), length(unique(posprob2)), by=variable
    ][V1 > 1, variable]
  } else {
    xs_adj = xs_all
  }
  # length of adjusting variables
  xs_len = length(xs_adj)
  # special_values
  special_values = check_special_values(special_values, xs_adj)
  # stop_limit
  kwargs = list(...)
  stop_limit = 0.1
  if ('stop_limit' %in% names(kwargs)) stop_limit = kwargs[['stop_limit']]
    # print(stop_limit)
  stop_limit = check_stop_limit(stop_limit, xs_adj)
  # to
  to = match.arg(to, c('breaks_list', 'bins_list'))
  # bin_close_right
  bin_close_right = check_bcr(bins)

  # breakslist of bins
  bins_breakslist = bins_to_breaks(bins, dt, bin_close_right = bin_close_right)

  # converting break list to text
  bin_brk2txt = function(bins_breakslist) {
    breaks_list = paste0(bins_breakslist[, paste0(variable, "=c(", x_breaks, ")")], collapse = ", \n ")
    breaks_list = paste0(c("list(", breaks_list, ")"), collapse = "\n ")
    return(breaks_list)
  }
  # save break list
  save_brk_lst = function(breaks_list, save_name) {
    writeLines(breaks_list, sprintf('%s.R', save_name), useBytes = TRUE)
    cat(sprintf('[INFO] The breaks_list is saved as %s\n', sprintf('%s.R', save_name)))
  }

  # loop on adjusting variables
  if (xs_len == 0) {
    warning("The binning breaks of all variables are perfect according to default settings.")

    breaks_list = bin_brk2txt(bins_breakslist)
    return(breaks_list)
  }


  i = 1
  breaks_list = NULL
  while (i <= xs_len) {
    # x variable
    breaks = NULL
    x_i = xs_adj[i]
    sv_i = special_values[[x_i]]

    # basic information of x_i variable ------
    woebin_adj_print_basic_info(dt, y, xs_adj, i, bins, bins_breakslist, ...)

    # adjusting breaks ------
    adj_brk = menu2(choices = c("next", "yes", "back"), title=paste0("> Adjust breaks for (", i, "/", xs_len, ") ", x_i, "?"))

    while (adj_brk == 2 || adj_brk == 'save') {
      if (adj_brk == 2) {
        stp_lmt = stop_limit[[x_i]]
        # modify breaks adj_brk == 2
        breaks = readline("> Enter modified breaks: ")
        breaks = gsub("^[,\\.]+|[,\\.]+$", "", breaks)
        if (breaks == "N") {
          stp_lmt = "N"
          breaks = NULL
        }

        breaks <- try(woebin_adj_break_plot(dt, y, x_i, breaks, stp_lmt, sv_i, method=method, bin_close_right = bin_close_right, ...), silent = TRUE)
      } else { # adj_brk == 'save'
        save_name = ifelse(is.null(save_breaks_list), 'brk_lst', save_breaks_list)
        save_brk_lst(bin_brk2txt(bins_breakslist), save_name)
      }

      adj_brk = menu2(c("next", "yes", "back"), title=paste0("> Adjust breaks for (", i, "/", xs_len, ") ", x_i, "?"))
    }

    if (adj_brk == 3) {
      # go back adj_brk == 3
      i = ifelse(i > 1, i-1, i)
    } else if (adj_brk == 1) {
      # go next adj_brk == 1
      if (!(is.null(breaks) || breaks == "")) bins_breakslist[variable == x_i][["x_breaks"]]  <- breaks
    i = i + 1
    } else if (grepl('^go[1-9][0-9]*$', adj_brk)) {
      i = as.integer(sub('go','', adj_brk))
    }
  }

  cat(sprintf("options(scorecard.bin_close_right = %s) \n", bin_close_right),"\n")
  breaks_list = bin_brk2txt(bins_breakslist)
  cat(breaks_list,"\n")
  if (!is.null(save_breaks_list)) save_brk_lst(breaks_list, save_breaks_list)
  if (to == 'bins_list') breaks_list = woebin(dt, y, x = names(breaks_list), breaks_list = breaks_list, special_values = special_values, method = method, count_distr_limit = count_distr_limit, ...)
  return(breaks_list)
}
